home *** CD-ROM | disk | FTP | other *** search
- /*
- ** Copyright (C) 1995, Enterprise Integration Technologies Corp.
- ** All Rights Reserved.
- ** Kevin Hughes, kevinh@eit.com
- ** 3/11/94
- */
-
- #include "swish.h"
- #include "search.h"
-
- /* The main search function.
- ** Parentheses are stripped out, things made lowercase,
- ** extra blanks removed, etc.
- */
-
- void search(words, indexlist, structure)
- char *words;
- struct swline *indexlist;
- int structure;
- {
- int i, j;
- float num;
- char word[MAXWORDLEN];
- struct result *resultlist;
- struct sortresult *sortresultlist;
- struct swline *tmplist;
- FILE *fp;
-
- searchwordlist = NULL;
-
- for (i = j = 0; words[i] != '\0' && words[i] != '\n'; i++) {
- if (isspace(words[i]) || words[i] == '(' ||
- words[i] == ')') {
- if (j) {
- word[j] = '\0';
- searchwordlist = (struct swline *)
- addswline(searchwordlist, (char *)
- convertentities(word));
- j = 0;
- }
- if (words[i] == '(') {
- searchwordlist = (struct swline *)
- addswline(searchwordlist, "(");
- }
- if (words[i] == ')') {
- searchwordlist = (struct swline *)
- addswline(searchwordlist, ")");
- }
- }
- else
- word[j++] = tolower(words[i]);
- }
- if (j) {
- word[j] = '\0';
- searchwordlist = (struct swline *)
- addswline(searchwordlist, (char *) convertentities(word));
- }
-
- printf("%s\n", INDEXHEADER);
- if (words[0] == '\0') {
- printf("err: no search words specified\n.\n");
- exit(0);
- }
-
- printf("search words:");
- tmplist = searchwordlist;
- while (tmplist != NULL) {
- printf(" %s", tmplist->line);
- tmplist = tmplist->next;
- }
- putchar('\n');
-
- while (indexlist != NULL) {
-
- commonerror = bigrank = 0;
-
- if ((fp = fopen(indexlist->line, "r")) == NULL) {
- printf("# Name: unknown index\n");
- printf("err: could not open index file\n.\n");
- exit(0);
- }
-
- if (!isokindexheader(fp)) {
- printf("err: the index file format is unknown\n.\n");
- exit(0);
- }
-
- getheader(fp);
-
- if (!getindexfilenum(fp)) {
- printf("err: the index file is empty\n.\n");
- exit(0);
- }
-
- readoffsets(fp);
- readstopwords(fp);
- readfileoffsets(fp);
-
- resultlist = NULL;
- tmplist = searchwordlist;
- tmplist = (struct swline *) fixnot(tmplist, fp);
- searchwordlist = (struct swline *) expandstar(tmplist, fp);
- resultlist = (struct result *) parseterm(fp, 0);
-
- sortresultlist = NULL;
- while (resultlist != NULL) {
- if (resultlist->structure & structure)
- sortresultlist = (struct sortresult *)
- addsortresult(sortresultlist, resultlist->rank,
- lookupfile(resultlist->filenum, fp));
- resultlist = resultlist->next;
- }
-
- fclose(fp);
-
- if (sortresultlist == NULL) {
- if (commonerror)
- printf("err: a word is too common\n");
- else
- printf("err: no results\n");
- }
- else {
- if (bigrank)
- num = 1000.0 / (float) bigrank;
- else
- num = 1000;
- printsortedresults(sortresultlist, num);
- }
-
- searchwordlist = tmplist;
- indexlist = indexlist->next;
-
- }
-
- printf(".\n");
- }
-
- /* This puts parentheses in the right places around not structures
- ** so the parser can do its thing correctly.
- */
-
- struct swline *fixnot(sp)
- struct swline *sp;
- {
- int openparen, hasnot;
- char lastline[MAXWORDLEN];
- struct swline *tmpp, *newp;
- #ifdef DEBUG
- struct swline *newp2;
- #endif
-
- tmpp = sp;
- newp = NULL;
-
- openparen = 0;
- hasnot = 0;
- lastline[0] = '\0';
- while (tmpp != NULL) {
- if ((tmpp->line)[0] == '(')
- openparen++;
- if ((tmpp->line)[0] == ')')
- openparen--;
- if (!strcmp(tmpp->line, "not") && lastline[0] != '\0' &&
- lastline[0] != '(') {
- hasnot = 1;
- newp = (struct swline *) addswline(newp, "(");
- }
- else if (hasnot && !openparen) {
- hasnot = 0;
- newp = (struct swline *) addswline(newp, tmpp->line);
- newp = (struct swline *) addswline(newp, ")");
- strcpy(lastline, tmpp->line);
- tmpp = tmpp->next;
- continue;
- }
- newp = (struct swline *) addswline(newp, tmpp->line);
- strcpy(lastline, tmpp->line);
- tmpp = tmpp->next;
- }
-
- #ifdef DEBUG
- newp2 = newp;
- while (newp2 != NULL) {
- printf("%s ", newp2->line);
- newp2 = newp2->next;
- }
- putchar('\n');
- #endif
-
- return newp;
- }
-
- /* Expands words with asterisks as wildcards into a series of
- ** "or" searches. Terms like "quick\*" are expanded into
- ** "quicktime or quickly", etc.
- */
-
- struct swline *expandstar(sp, fp)
- struct swline *sp;
- FILE *fp;
- {
- int i, firsttime, gotstar;
- char foundword[MAXWORDLEN], searchword[MAXWORDLEN];
- struct swline *newp;
-
- newp = NULL;
- while (sp != NULL) {
- strcpy(searchword, sp->line);
- if (searchword[0] != '*' && strchr(searchword, '*')) {
- for (i = gotstar = 0; searchword[i]; i++)
- if (gotstar)
- searchword[i] = '\0';
- else if (searchword[i] == '*') {
- searchword[i] = '\0';
- gotstar = 1;
- }
- firsttime = 0;
- do {
- strcpy(foundword, getmatchword(searchword,
- fp, firsttime));
- if (strcmp(foundword, NOWORD)) {
- if (firsttime)
- newp = (struct swline *)
- addswline(newp, "or");
- newp = (struct swline *)
- addswline(newp, foundword);
- }
- else {
- if (!firsttime)
- newp = (struct swline *)
- addswline(newp, NOWORD);
- break;
- }
- firsttime++;
- } while (strcmp(foundword, NOWORD));
- }
- else {
- newp = (struct swline *) addswline(newp,
- searchword);
- }
- sp = sp->next;
- }
- return newp;
- }
-
- /* If firsttime is 1, returns the first match to a beginnng of a word.
- ** Else if it's 0, returns the next match, until nothing is found,
- ** in which case NULL is returned.
- */
-
- char *getmatchword(word, fp, firsttime)
- char *word;
- FILE *fp;
- int firsttime;
- {
- int i, c, found;
- char *d;
- static char fileword[MAXWORDLEN];
-
- if (!firsttime) {
- for (i = found = 0; indexchars[i] != '\0'; i++)
- if (word[0] == indexchars[i]) {
- fseek(fp, offsets[i], 0);
- found = 1;
- }
- if (!found)
- return NOWORD;
- }
-
- if (offsets[STOPWORDPOS] == ftell(fp))
- return NOWORD;
- for (i = 0; (c = fgetc(fp)) != 0; ) {
- if (c == ':') {
- fileword[i] = '\0';
- i = 0;
- while ((c = fgetc(fp)) != 0)
- ;
- if (fileword[0] != word[0])
- return NOWORD;
- d = (char *) strstr(fileword, word);
- if (d != NULL && d == &fileword[0])
- return fileword;
- else {
- if (offsets[STOPWORDPOS] == ftell(fp))
- return NOWORD;
- }
- }
- else
- fileword[i++] = c;
- }
- return NOWORD;
- }
-
- /* Reads and prints the header of an index file.
- */
-
- void getheader(fp)
- FILE *fp;
- {
- int c;
- char line[MAXSTRLEN];
-
- fgets(line, MAXSTRLEN, fp);
- while (1) {
- c = fgetc(fp);
- ungetc(c, fp);
- if (c == '#') {
- fgets(line, MAXSTRLEN, fp);
- printf("%s", line);
- continue;
- }
- else
- break;
- }
- fseek(fp, 0, 0);
- }
-
- /* Reads the offsets in the index file so word lookup is faster.
- */
-
- void readoffsets(fp)
- FILE *fp;
- {
- int c, i, k;
- long j, num;
-
- for (i = 0; i < MAXCHARS; i++)
- offsets[i] = 0;
-
- fseek(fp, 0, 0);
- while (1) {
- c = fgetc(fp);
- if (c == '#') {
- do {
- c = fgetc(fp);
- } while (c && c != '\n');
- continue;
- }
- else
- break;
- }
-
- j = 0;
- while (c != EOF && c != '\n') {
- k = MAXLONGLEN;
- for (num = 0; c && isdigit(c) && k--; ) {
- num = (num * 10) + (c - '0');
- c = fgetc(fp);
- }
- offsets[j++] = num;
- }
- }
-
- /* Reads the stopwords in the index file.
- */
-
- void readstopwords(fp)
- FILE *fp;
- {
- int i, c;
- char word[MAXWORDLEN];
-
- fseek(fp, offsets[STOPWORDPOS], 0);
- for (i = 0; (c = fgetc(fp)) != '\n' && c != EOF; )
- if (!isspace(c))
- word[i++] = c;
- else {
- word[i] = '\0';
- addstophash(word);
- i = 0;
- }
- }
-
- /* Reads the file offset table in the index file.
- */
-
- void readfileoffsets(fp)
- FILE *fp;
- {
- int j, k, c;
- long num;
-
- j = 0;
- fseek(fp, offsets[FILEOFFSETPOS], 0);
- c = fgetc(fp);
- while (c != EOF && c != '\n') {
- k = MAXLONGLEN;
- for (num = 0; c != EOF && isdigit(c) && k--; ) {
- num = (num * 10) + (c - '0');
- c = fgetc(fp);
- }
- addtofilehashlist(j++, num);
- }
- }
-
- /* The recursive parsing function.
- ** This was a headache to make but ended up being surprisingly easy. :)
- ** parseone tells the function to only operate on one word or term.
- */
-
- struct result *parseterm(fp, parseone)
- FILE *fp;
- int parseone;
- {
- int rulenum;
- char word[MAXWORDLEN];
- struct result *rp, *newrp;
-
- rp = NULL;
-
- rulenum = OR_RULE;
- while (searchwordlist != NULL) {
- strcpy(word, searchwordlist->line);
-
- if (rulenum == NO_RULE)
- rulenum = DEFAULT_RULE;
- if (isunaryrule(word)) {
- searchwordlist = searchwordlist->next;
- rp = (struct result *) parseterm(fp, 1);
- rp = (struct result *) notresultlist(rp, fp);
- continue;
- }
- else if (isbooleanrule(word)) {
- rulenum = getrulenum(word);
- searchwordlist = searchwordlist->next;
- continue;
- }
-
- if (word[0] == '(') {
-
- searchwordlist = searchwordlist->next;
- newrp = (struct result *) parseterm(fp, 0);
-
- if (rulenum == AND_RULE)
- rp = (struct result *)
- andresultlists(rp, newrp);
- else if (rulenum == OR_RULE)
- rp = (struct result *)
- orresultlists(rp, newrp);
- if (searchwordlist == NULL)
- break;
-
- continue;
-
- }
- else if (word[0] == ')') {
- searchwordlist = searchwordlist->next;
- break;
- }
-
- rp = (struct result *) operate(rp, rulenum, word, fp);
-
- if (parseone) {
- searchwordlist = searchwordlist->next;
- break;
- }
- rulenum = NO_RULE;
-
- searchwordlist = searchwordlist->next;
- }
-
- return rp;
- }
-
- /* Looks up a word in the index file -
- ** it calls getfileinfo(), which does the real searching.
- */
-
- struct result *operate(rp, rulenum, word, fp)
- struct result *rp;
- int rulenum;
- char *word;
- FILE *fp;
- {
- int i, found;
- struct result *newrp, *returnrp;
-
- if (isstopword(word) && !isrule(word)) {
- if (rulenum == OR_RULE && rp != NULL)
- return rp;
- else
- commonerror = 1;
- }
-
- for (i = found = 0; indexchars[i] != '\0'; i++)
- if (word[0] == indexchars[i]) {
- fseek(fp, offsets[i], 0);
- found = 1;
- }
- if (!found) {
- if (rulenum == AND_RULE)
- return NULL;
- else if (rulenum == OR_RULE)
- return rp;
- }
-
- newrp = (struct result *) getfileinfo(word, fp);
- if (rulenum == AND_RULE)
- returnrp = (struct result *) andresultlists(rp, newrp);
- else if (rulenum == OR_RULE)
- returnrp = (struct result *) orresultlists(rp, newrp);
- else if (rulenum == NOT_RULE)
- returnrp = (struct result *) notresultlist(newrp, fp);
- return returnrp;
- }
-
- /* Looks up a file name in the index file.
- */
-
- char *lookupfile(filenum, fp)
- int filenum;
- FILE *fp;
- {
- static char line[MAXSTRLEN];
-
- fseek(fp, getfilenum(decodefilenum(filenum) - 1), 0);
- fgets(line, MAXSTRLEN, fp);
-
- return line;
- }
-
- /* Finds a word and returns its corresponding file and rank information list.
- ** If not found, NULL is returned.
- */
-
- struct result *getfileinfo(word, fp)
- char *word;
- FILE *fp;
- {
- int i, c, x, countnum, rank, filenum, structure;
- char fileword[MAXWORDLEN];
- struct result *rp;
-
- rp = NULL;
-
- for (i = 0; (c = fgetc(fp)) != 0; ) {
- if (c == ':') {
- fileword[i] = '\0';
- i = 0;
- if (!strcmp(word, fileword))
- break;
- else {
- while ((c = fgetc(fp)) != 0)
- ;
- if (offsets[STOPWORDPOS] == ftell(fp))
- return NULL;
- continue;
- }
- }
- else
- fileword[i++] = c;
- }
- if (c == 0)
- return NULL;
-
- countnum = 1;
-
- ungetc(c, fp);
- while ((c = fgetc(fp)) != 0) {
- x = 0;
- do {
- c = fgetc(fp);
- if (c == 0)
- return rp;
- x *= 128;
- x += c & 127;
- } while (c & 128);
- if (x) {
- if (countnum == 1) {
- filenum = x;
- countnum++;
- }
- else if (countnum == 2) {
- rank = x;
- countnum++;
- }
- else if (countnum == 3) {
- structure = x;
- rp = (struct result *)
- addtoresultlist(rp, filenum, rank,
- structure);
- countnum = 1;
- }
- }
- }
-
- return rp;
- }
-
- /* Is a word a rule?
- */
-
- int isrule(word)
- char *word;
- {
- if (!strcmp(word, "and") || !strcmp(word, "or") || !strcmp(word, "not"))
- return 1;
- else
- return 0;
- }
-
- /* Is a word a boolean rule?
- */
-
- int isbooleanrule(word)
- char *word;
- {
- if (!strcmp(word, "and") || !strcmp(word, "or"))
- return 1;
- else
- return 0;
- }
-
- /* Is a word a unary rule?
- */
-
- int isunaryrule(word)
- char *word;
- {
- if (!strcmp(word, "not"))
- return 1;
- else
- return 0;
- }
-
- /* Return the number for a rule.
- */
-
- int getrulenum(word)
- char *word;
- {
- if (!strcmp(word, "and"))
- return AND_RULE;
- else if (!strcmp(word, "or"))
- return OR_RULE;
- else if (!strcmp(word, "not"))
- return NOT_RULE;
- return NO_RULE;
- }
-
- /* Takes two lists of results from searches and ANDs them together.
- */
-
- struct result *andresultlists(r1, r2)
- struct result *r1;
- struct result *r2;
- {
- static struct result *tmpnode, *newnode;
-
- if (r1 == NULL || r2 == NULL)
- return NULL;
-
- newnode = NULL;
-
- while (r1 != NULL) {
- tmpnode = r2;
- while (tmpnode != NULL) {
- if (r1->filenum == tmpnode->filenum)
- newnode = (struct result *)
- addtoresultlist(newnode, r1->filenum,
- (r1->rank + tmpnode->rank) / 2,
- r1->structure & tmpnode->structure);
- tmpnode = tmpnode->next;
- }
- r1 = r1->next;
- }
-
- return newnode;
- }
-
- /* Takes two lists of results from searches and ORs them together.
- */
-
- struct result *orresultlists(r1, r2)
- struct result *r1;
- struct result *r2;
- {
- int i;
- struct result *rp;
- static struct result *newnode;
-
- newnode = NULL;
-
- if (r1 == NULL)
- return r2;
- else if (r2 == NULL)
- return r1;
-
- initresulthashlist();
- while (r1 != NULL) {
- mergeresulthashlist(r1->filenum, r1->rank, r1->structure);
- r1 = r1->next;
- }
- while (r2 != NULL) {
- mergeresulthashlist(r2->filenum, r2->rank, r2->structure);
- r2 = r2->next;
- }
- for (i = 0; i < HASHSIZE; i++) {
- rp = resulthashlist[i];
- while (rp != NULL) {
- newnode = (struct result *) addtoresultlist(newnode,
- rp->filenum, rp->rank, rp->structure);
- rp = rp->next;
- }
- }
-
- return newnode;
- }
-
- /* This performs the NOT unary operation on a result list.
- ** NOTed files are marked with a default rank of 1000.
- */
-
- struct result *notresultlist(rp, fp)
- struct result *rp;
- FILE *fp;
- {
- int i, filenums;
- struct result *newp;
-
- newp = NULL;
-
- initmarkentrylist();
- while (rp != NULL) {
- marknum(rp->filenum);
- rp = rp->next;
- }
-
- filenums = getindexfilenum(fp);
- for (i = 1; !ismarked(i) && i < filenums; i++)
- newp = (struct result *) addtoresultlist(newp,
- i, 1000, IN_ALL);
-
- return newp;
- }
-
- /* Adds a file number and rank to a list of results.
- */
-
- struct result *addtoresultlist(rp, filenum, rank, structure)
- struct result *rp;
- int filenum;
- int rank;
- int structure;
- {
- struct result *newnode;
- static struct result *head;
-
- newnode = (struct result *) emalloc(sizeof(struct result));
- newnode->filenum = filenum;
- newnode->rank = rank;
- newnode->structure = structure;
- newnode->next = NULL;
-
- if (rp == NULL)
- rp = newnode;
- else
- head->next = newnode;
-
- head = newnode;
-
- return rp;
- }
-
- /* Adds the results of a search, sorts them by rank.
- */
-
- struct sortresult *addsortresult(sp, rank, fileinfo)
- struct sortresult *sp;
- int rank;
- char *fileinfo;
- {
- if (rank > bigrank)
- bigrank = rank;
-
- if (sp == NULL) {
- sp = (struct sortresult *) emalloc(sizeof(struct sortresult));
- sp->rank = rank;
- sp->fileinfo = (char *) mystrdup(fileinfo);
- sp->left = sp->right = NULL;
- }
- else {
- if (sp->rank < rank)
- sp->left = (struct sortresult *)
- addsortresult(sp->left, rank, fileinfo);
- else
- sp->right = (struct sortresult *)
- addsortresult(sp->right, rank, fileinfo);
- }
-
- return sp;
- }
-
- /* Prints the final results of a search.
- */
-
- void printsortedresults(sp, num)
- struct sortresult *sp;
- float num;
- {
- int rank;
-
- if (sp != NULL) {
- printsortedresults(sp->left, num);
- rank = (int) ((float) sp->rank * num);
- if (rank >= 999)
- rank = 1000;
- if (maxhits) {
- if (maxhits == -1)
- printf("%d %s", (rank <= 0) ? 1 :
- rank, sp->fileinfo);
- else if (maxhits > 0) {
- printf("%d %s", (rank <= 0) ? 1 :
- rank, sp->fileinfo);
- maxhits--;
- }
- }
- printsortedresults(sp->right, num);
- }
- }
-
- /* Reads a compressed line. This is just here for testing, etc.
- */
-
- void getrawindexline(fp)
- FILE *fp;
- {
- int c, inword;
-
- inword = 1;
- while ((c = fgetc(fp)) != EOF) {
- if (c == ':' && inword)
- inword = 0;
- if (!inword) {
- do {
- c = fgetc(fp);
- if (c == 0)
- return;
- } while (c & 128);
- }
- }
- }
-
- /* Does an index file have a readable format?
- */
-
- int isokindexheader(fp)
- FILE *fp;
- {
- char line[MAXSTRLEN];
-
- fseek(fp, 0, 0);
- fgets(line, MAXSTRLEN, fp);
- if (line[strlen(line) - 1] == '\n')
- line[strlen(line) - 1] = '\0';
- if (strcmp(line, INDEXHEADER)) {
- fseek(fp, 0, 0);
- return 0;
- }
- fseek(fp, 0, 0);
- return 1;
- }
-